knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(ggplot2)
nypd = read.csv("https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv?accessType=DOWNLOAD")
#Remove the columns that I am not interested in for this project
nypd <- nypd %>%
select(-c(Latitude, Longitude, Lon_Lat, X_COORD_CD, Y_COORD_CD))
#Create some new columns that I will use for analysis
nypd <- nypd %>%
mutate(OCCUR_DATE = mdy(OCCUR_DATE),
OCCUR_TIME = hms(OCCUR_TIME),
OCCUR_HOUR = as.integer(OCCUR_TIME@hour),
OCCUR_MONTH = as.integer(month(OCCUR_DATE)),
OCCUR_YEAR = year(OCCUR_DATE),
STATISTICAL_MURDER_FLAG = as.logical(STATISTICAL_MURDER_FLAG),
SHOOTINGS = 1,
MURDER = as.integer(STATISTICAL_MURDER_FLAG))
#Handle missing data
nypd[nypd == ''] <- NA
nypd <- nypd %>%
replace_na(list(PERP_AGE_GROUP = "UNKNOW", PERP_RACE = "UNKNOWN", PERP_SEX = "UNKNOWN", VIC_AGE_GROUP = "UNKNOWN", VIC_SEX="UNKNOWN", VIC_RACE="UNKNOWN"))
#color plot
ggplot(nypd, aes(x=as.factor(BORO), fill=as.factor(BORO))) +
geom_bar() +
scale_fill_brewer(palette = "Set1") +
theme(legend.position="none")
#Barplot by Boro, stacked by murder
ggplot(nypd, aes(BORO, fill = STATISTICAL_MURDER_FLAG)) +
geom_bar(position = "stack")
#Barplot by Boro, stacked by murder percentage
ggplot(nypd, aes(BORO, fill = STATISTICAL_MURDER_FLAG)) +
geom_bar(position = "fill")
#Create a new df for time series analysis
nypd_annual <- nypd %>%
group_by(OCCUR_YEAR, SHOOTINGS)%>%
summarize(SHOOTINGS = sum(SHOOTINGS),
STATISTICAL_MURDER_FLAG = sum(STATISTICAL_MURDER_FLAG)) %>%
select(OCCUR_YEAR, SHOOTINGS, STATISTICAL_MURDER_FLAG) %>%
ungroup()
## `summarise()` has grouped output by 'OCCUR_YEAR'. You can override using the
## `.groups` argument.
nypd_annual_murder <- nypd %>%
group_by(OCCUR_YEAR, SHOOTINGS)%>%
summarize(SHOOTINGS = sum(SHOOTINGS),
MURDER = sum(MURDER)) %>%
select(OCCUR_YEAR, SHOOTINGS, MURDER) %>%
ungroup()
## `summarise()` has grouped output by 'OCCUR_YEAR'. You can override using the
## `.groups` argument.
#Create a new df for boro time series analysis
nypd_annual_boro <- nypd %>%
group_by(OCCUR_YEAR, SHOOTINGS, BORO)%>%
summarize(SHOOTINGS = sum(SHOOTINGS),
STATISTICAL_MURDER_FLAG = sum(STATISTICAL_MURDER_FLAG)) %>%
select(OCCUR_YEAR, BORO, SHOOTINGS, STATISTICAL_MURDER_FLAG) %>%
ungroup()
## `summarise()` has grouped output by 'OCCUR_YEAR', 'SHOOTINGS'. You can override
## using the `.groups` argument.
#Timeseries lineplot
ggplot(nypd_annual, aes(x=OCCUR_YEAR)) +
geom_line(aes(y=SHOOTINGS)) +
geom_point(aes(y=SHOOTINGS)) +
geom_line(aes(y=STATISTICAL_MURDER_FLAG), color="red") +
geom_point(aes(y=STATISTICAL_MURDER_FLAG), color="red") +
labs(title = "NYPD Shootings (Black) and Murders (Red) by Year",
x = "Year",
y = "Shootings/Murders")
#Timeseries by boro
ggplot(nypd_annual_boro, aes(x=OCCUR_YEAR, y=SHOOTINGS, color=BORO)) +
geom_line() +
geom_point() +
labs(title = "NYPD Shootings by Year by Boro",
x = "Year",
y = "Shootings")
#histogram by hour
ggplot(nypd, aes(x=as.factor(OCCUR_HOUR))) +
geom_bar(fill="blue")
#histogram by month
ggplot(nypd, aes(x=as.factor(OCCUR_MONTH))) +
geom_bar(fill="red")
#2d histogram by month and time of day
fig1 <- plot_ly(x = nypd$OCCUR_HOUR, y = nypd$OCCUR_MONTH, type = "histogram2dcontour")
fig1 <- fig1 %>%
colorbar(title = "Shootings") %>%
layout(title = 'Time of Day vs. Month', plot_bgcolor = "#e5ecf6", xaxis=list(title = "Time (Local)"), yaxis = list(title = "Month"))
fig1
#putting it all together
s <- subplot(
plot_ly(x = nypd$OCCUR_HOUR, type = "histogram"),
plotly_empty(),
plot_ly(x = nypd$OCCUR_HOUR, y = nypd$OCCUR_MONTH, type = "histogram2dcontour"),
plot_ly(y = nypd$OCCUR_MONTH, type = "histogram"),
nrows = 2, heights = c(0.2, 0.8), widths = c(0.8, 0.2), margin = 0,
shareX = TRUE, shareY = TRUE, titleX = FALSE, titleY = FALSE
)
## Warning: No trace type specified and no positional attributes specified
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
fig <- layout(s, showlegend = FALSE)
fig
In this report, I did some analysis on NYPD shooting trends. There appeared to be a steady decline in both shootings and murders until 2020, when there was a sharp increase. Additionally, we examined the time of year and time of day when shootings are most likely to occur. The data show that the summertime and during the evening/nighttime hours are the most dangerous.
There are a few areas of potential bias in these data. There are likely shootings that go unreported to the NYPD, and hence would not be in this data. There may be some boroughs that report shootings at a higher rate than others.